Introduction

Libraries

For this assignment, the following libraries are used:

library(GGally)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(ggcorrplot)
library(dendextend)
library(dbscan)
library(factoextra)
library(mclust)
library(cluster)
library(igraph)
library(clValid)
library(cluster)

Data

dataset <- read.csv("../datasets/penguindata.csv")
summary(dataset)
##        X          bill_length_mm  bill_depth_mm   flipper_length_mm
##  Min.   :  1.00   Min.   :32.10   Min.   :13.10   Min.   :172.0    
##  1st Qu.: 86.75   1st Qu.:39.23   1st Qu.:15.60   1st Qu.:190.0    
##  Median :172.50   Median :44.45   Median :17.30   Median :197.0    
##  Mean   :172.50   Mean   :43.92   Mean   :17.15   Mean   :200.9    
##  3rd Qu.:258.25   3rd Qu.:48.50   3rd Qu.:18.70   3rd Qu.:213.0    
##  Max.   :344.00   Max.   :59.60   Max.   :21.50   Max.   :231.0    
##                   NA's   :2       NA's   :2       NA's   :2        
##   body_mass_g       sex                 year     
##  Min.   :2700   Length:344         Min.   :2007  
##  1st Qu.:3550   Class :character   1st Qu.:2007  
##  Median :4050   Mode  :character   Median :2008  
##  Mean   :4202                      Mean   :2008  
##  3rd Qu.:4750                      3rd Qu.:2009  
##  Max.   :6300                      Max.   :2009  
##  NA's   :2
str(dataset)
## 'data.frame':    344 obs. of  7 variables:
##  $ X                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ bill_length_mm   : num  39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int  181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int  3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : chr  "male" "female" "female" NA ...
##  $ year             : int  2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
df <- dataset
df <- subset(df, select = -X)
df <- df %>% mutate(bill_length_mm = ifelse(is.na(bill_length_mm), mean(bill_length_mm, na.rm = TRUE), bill_length_mm))
df <- df %>% mutate(bill_depth_mm = ifelse(is.na(bill_depth_mm), mean(bill_depth_mm, na.rm = TRUE), bill_depth_mm))
df <- df %>% mutate(flipper_length_mm = ifelse(is.na(flipper_length_mm), mean(flipper_length_mm, na.rm = TRUE), flipper_length_mm))
df <- df %>% mutate(body_mass_g = ifelse(is.na(body_mass_g), mean(body_mass_g, na.rm = TRUE), body_mass_g))
df <- df %>% mutate(sex = ifelse(is.na(sex), median(sex, na.rm = TRUE), sex))
df <- df %>% mutate(year = ifelse(is.na(year), median(year, na.rm = TRUE), year))
summary(df)
##  bill_length_mm  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :32.10   Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:39.27   1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :44.25   Median :17.30   Median :197.0     Median :4050  
##  Mean   :43.92   Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:48.50   3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :59.60   Max.   :21.50   Max.   :231.0     Max.   :6300  
##      sex                 year     
##  Length:344         Min.   :2007  
##  Class :character   1st Qu.:2007  
##  Mode  :character   Median :2008  
##                     Mean   :2008  
##                     3rd Qu.:2009  
##                     Max.   :2009
# sex 1,2
# year 1,2,3
df <- df %>% mutate(sex = ifelse(sex == "male", 1, 2))
# [0,1]
df$year <- (df$year - min(df$year)) / (max(df$year) - min(df$year))

summary(df)
##  bill_length_mm  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :32.10   Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:39.27   1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :44.25   Median :17.30   Median :197.0     Median :4050  
##  Mean   :43.92   Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:48.50   3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :59.60   Max.   :21.50   Max.   :231.0     Max.   :6300  
##       sex            year       
##  Min.   :1.00   Min.   :0.0000  
##  1st Qu.:1.00   1st Qu.:0.0000  
##  Median :1.00   Median :0.5000  
##  Mean   :1.48   Mean   :0.5145  
##  3rd Qu.:2.00   3rd Qu.:1.0000  
##  Max.   :2.00   Max.   :1.0000
str(df)
## 'data.frame':    344 obs. of  6 variables:
##  $ bill_length_mm   : num  39.1 39.5 40.3 43.9 36.7 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 17.2 19.3 ...
##  $ flipper_length_mm: num  181 186 195 201 193 ...
##  $ body_mass_g      : num  3750 3800 3250 4202 3450 ...
##  $ sex              : num  1 2 2 1 2 1 2 1 1 1 ...
##  $ year             : num  0 0 0 0 0 0 0 0 0 0 ...

Models

Once the data is prepared, it’s ready to be used in the models. In this section, each model is given three random subsets of the cleaned dataset to classify as one of the class labels. The models are as follows:

Each run of the model will output the confusion matrix and the evaluation of each run. Later on, it will be studied and commented on in the Results section. Additionally, the results of each run will be displayed immediately after every model, as shown in the following sections for each model.

set.seed(2828)

K-Means Clustering

penguins <- df
penguins$bill_length_mm <- as.numeric(penguins$bill_length_mm)
penguins$bill_depth_mm <- as.numeric(penguins$bill_depth_mm)
penguins$flipper_length_mm <- as.numeric(penguins$flipper_length_mm)
penguins$body_mass_g <- as.numeric(penguins$body_mass_g)

summary(penguins)
##  bill_length_mm  bill_depth_mm   flipper_length_mm  body_mass_g  
##  Min.   :32.10   Min.   :13.10   Min.   :172.0     Min.   :2700  
##  1st Qu.:39.27   1st Qu.:15.60   1st Qu.:190.0     1st Qu.:3550  
##  Median :44.25   Median :17.30   Median :197.0     Median :4050  
##  Mean   :43.92   Mean   :17.15   Mean   :200.9     Mean   :4202  
##  3rd Qu.:48.50   3rd Qu.:18.70   3rd Qu.:213.0     3rd Qu.:4750  
##  Max.   :59.60   Max.   :21.50   Max.   :231.0     Max.   :6300  
##       sex            year       
##  Min.   :1.00   Min.   :0.0000  
##  1st Qu.:1.00   1st Qu.:0.0000  
##  Median :1.00   Median :0.5000  
##  Mean   :1.48   Mean   :0.5145  
##  3rd Qu.:2.00   3rd Qu.:1.0000  
##  Max.   :2.00   Max.   :1.0000
str(penguins)
## 'data.frame':    344 obs. of  6 variables:
##  $ bill_length_mm   : num  39.1 39.5 40.3 43.9 36.7 ...
##  $ bill_depth_mm    : num  18.7 17.4 18 17.2 19.3 ...
##  $ flipper_length_mm: num  181 186 195 201 193 ...
##  $ body_mass_g      : num  3750 3800 3250 4202 3450 ...
##  $ sex              : num  1 2 2 1 2 1 2 1 1 1 ...
##  $ year             : num  0 0 0 0 0 0 0 0 0 0 ...
fviz_nbclust(
  x = penguins, FUNcluster = pam, method = "wss", k.max = 15,
  diss = dist(penguins, method = "manhattan")
)

matriz_distancias <- dist(x = penguins, method = "euclidean")

hc_euclidea_completo <- hclust(d = matriz_distancias, method = "complete")
hc_euclidea_single <- hclust(d = matriz_distancias, method = "single")
hc_euclidea_average <- hclust(d = matriz_distancias, method = "average")

grid.arrange(
  plot(
    x = hc_euclidea_completo, cex = 0.6, xlab = "", ylab = "", sub = "",
    main = "Linkage complete"
  ),
  plot(
    x = hc_euclidea_single, cex = 0.6, xlab = "", ylab = "", sub = "",
    main = "Linkage single"
  ),
  plot(
    x = hc_euclidea_average, cex = 0.6, xlab = "", ylab = "", sub = "",
    main = "Linkage average"
  ),
  nrow = 2
)

features <- penguins[, c("sex", "year", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")]
scaled_features <- scale(features)
k <- 3
kmeans_result <- kmeans(scaled_features, centers = k)
penguins$cluster <- kmeans_result$cluster

plot(penguins[, c(1:6)], col = kmeans_result$cluster)
title("Pair Plot: Scatter Plots of Numeric Variables by Cluster")

fviz_cluster(
  object = kmeans_result, data = penguins, show.clust.cent = TRUE,
  ellipse.type = "euclid", star.plot = TRUE, repel = TRUE
) +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +
  theme(legend.position = "none")

hclust_result <- hclust(dist(scaled_features))

dend_colored <- color_branches(hclust_result, k = k)

labels(dend_colored) <- NULL

plot(dend_colored, cex = 0.6, main = "Hierarchical Clustering Dendrogram")
rect.hclust(hclust_result, k, border = "black")

grid.arrange(
  fviz_dend(
    x = hclust_result,
    k = k,
    color_labels_by_k = TRUE,
    cex = 0.8,
    type = "phylogenic",
    repel = TRUE,
    xlab = "",
    ylab = ""
  ),
  fviz_dend(
    x = hclust_result,
    k = k,
    color_labels_by_k = TRUE,
    cex = 0.5,
    type = "circular",
  ),
  nrow = 1
)

k <- 4
kmeans_result <- kmeans(scaled_features, centers = k)
penguins$cluster <- kmeans_result$cluster

plot(penguins[, c(1:6)], col = kmeans_result$cluster)
title("Pair Plot: Scatter Plots of Numeric Variables by Cluster")

fviz_cluster(
  object = kmeans_result, data = penguins, show.clust.cent = TRUE,
  ellipse.type = "euclid", star.plot = TRUE, repel = TRUE
) +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +
  theme(legend.position = "none")

plot_fviz_cluster <- function(features, dt, cents) {
  plot <- fviz_cluster(
    object = kmeans(features, centers = cents),
    data = dt,
    show.clust.cent = FALSE,
    ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
    show.dist = FALSE,
    xlab = "",
    ylab = ""
  ) +
    labs(title = paste("K-means ", cents)) +
    theme_bw() +
    theme(legend.position = "none")

  return(plot)
}
grid.arrange(
  plot_fviz_cluster(scaled_features, penguins, 5),
  plot_fviz_cluster(scaled_features, penguins, 6),
  ncol = 2
)

grid.arrange(
  plot_fviz_cluster(scaled_features, penguins, 7),
  plot_fviz_cluster(scaled_features, penguins, 8),
  ncol = 2
)

hclust_result <- hclust(dist(scaled_features))

dend_colored <- color_branches(hclust_result, k = k)

labels(dend_colored) <- NULL

plot(dend_colored, cex = 0.6, main = "Hierarchical Clustering Dendrogram")

rect.hclust(hclust_result, k, border = "black")

grid.arrange(
  fviz_dend(
    x = hclust_result,
    k = k,
    color_labels_by_k = TRUE,
    cex = 0.8,
    type = "phylogenic",
    repel = TRUE,
    xlab = "",
    ylab = ""
  ),
  fviz_dend(
    x = hclust_result,
    k = k,
    color_labels_by_k = TRUE,
    cex = 0.5,
    type = "circular",
    show_labels = FALSE
  ),
  nrow = 1
)

DBSCAN Clustering

dbscan_result <- dbscan(scaled_features, eps = 0.5, MinPts = 5)
fviz_cluster(
  object = dbscan_result, data = scaled_features, stand = FALSE,
  geom = "point", ellipse = TRUE, show.clust.cent = TRUE,
  pallete = "jco",
  main = "DBSCAN Clustering"
) +
  theme_bw() +
  theme(legend.position = "bottom")

Gaussian Mixture Model Clustering

gmm_result <- Mclust(scaled_features)
plot(gmm_result, what = "classification", main = "GMM Clustering")

AGNES Clustering

agnes_result <- agnes(scaled_features)

fviz_dend(agnes_result,
  main = "AGNES Clustering Dendrogram",
  show_labels = FALSE,
  palette = "jama",
  k = 5,
  color_labels_by_k = TRUE,
  ggtheme = theme_classic()
)

OPTICS Clustering

optics_result <- optics(scaled_features, minPts = 5)
plot(optics_result, main = "OPTICS Clustering")

PAM Clustering

pam_result <- pam(dist(scaled_features, method = "euclidean"), 3)
clusplot(pam_result)

pam_result <- pam(dist(scaled_features, method = "euclidean"), 4)
clusplot(pam_result)

Results

comparacion <- clValid(
  obj        = scaled_features,
  nClust     = 2:6,
  clMethods  = c("hierarchical", "kmeans", "pam", "agnes"),
  validation = c("stability", "internal")
)
summary(comparacion)
## 
## Clustering Methods:
##  hierarchical kmeans pam agnes 
## 
## Cluster sizes:
##  2 3 4 5 6 
## 
## Validation Measures:
##                                  2       3       4       5       6
##                                                                   
## hierarchical APN            0.0029  0.0571  0.0549  0.0674  0.1238
##              AD             2.8952  2.5229  2.2262  2.0173  2.0090
##              ADM            0.7357  0.1446  0.2327  0.2605  0.3709
##              FOM            0.9791  0.7540  0.7212  0.6927  0.6929
##              Connectivity   0.1000  3.0290  3.0290  3.0290  5.9579
##              Dunn           0.2793  0.2793  0.3074  0.3276  0.3276
##              Silhouette     0.3684  0.3177  0.3285  0.3522  0.3370
## kmeans       APN            0.0346  0.0357  0.1608  0.1227  0.1397
##              AD             2.5685  2.2502  2.1904  1.9511  1.8755
##              ADM            0.1379  0.2187  0.5525  0.4489  0.4458
##              FOM            0.8010  0.7467  0.7309  0.7017  0.7037
##              Connectivity   0.1000  0.1000 22.5944 22.5944 25.3778
##              Dunn           0.2793  0.2890  0.0751  0.0901  0.0901
##              Silhouette     0.3684  0.3455  0.3296  0.3654  0.3500
## pam          APN            0.3197  0.1342  0.0960  0.1521  0.2153
##              AD             2.9847  2.3744  2.0505  1.9692  1.8878
##              ADM            1.1915  0.5252  0.3418  0.4846  0.5989
##              FOM            0.9691  0.8096  0.7188  0.7156  0.7008
##              Connectivity  57.5786  8.1048  2.9290 31.1913 48.7794
##              Dunn           0.0492  0.0725  0.2428  0.0858  0.1039
##              Silhouette     0.3263  0.3424  0.3668  0.3629  0.3433
## agnes        APN            0.0029  0.0571  0.0549  0.0674  0.1238
##              AD             2.8952  2.5229  2.2262  2.0173  2.0090
##              ADM            0.7357  0.1446  0.2327  0.2605  0.3709
##              FOM            0.9791  0.7540  0.7212  0.6927  0.6929
##              Connectivity   0.1000  3.0290  3.0290  3.0290  5.9579
##              Dunn           0.2793  0.2793  0.3074  0.3276  0.3276
##              Silhouette     0.3684  0.3177  0.3285  0.3522  0.3370
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## APN          0.0029 hierarchical 2       
## AD           1.8755 kmeans       6       
## ADM          0.1379 kmeans       2       
## FOM          0.6927 hierarchical 5       
## Connectivity 0.1000 hierarchical 2       
## Dunn         0.3276 hierarchical 5       
## Silhouette   0.3684 hierarchical 2